In [1]:
from top2vec import Top2Vec
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import json
In [47]:
df = pd.read_csv('../data/comments.csv').dropna(subset=['text']).reset_index(drop=True)
In [16]:
model = Top2Vec.load('../trained_model_2')
In [3]:
model.documents
Out[3]:
array(['It wasn\'t just me!Though it was interesting that googling "google sheets down" and clicking News returned 3 results regarding winter weather bedding and 0 results about Google Sheets.',
       'up for me now. Google SREs must be busy today!',
       'Drive is down as well.', ...,
       'Location: Toronto, ON, Canada (Canadian citizen)Remote: YesWilling to relocate: YesTechnologies:-  Languages: Python, MATLAB, Fortan, (some experience: shell scripting, C++)-  Packages: NumPy, SciPy, sklearn, Pandas, Keras, Tensorflow, \x85-  Other: Git, (some experience: Docker, SQL)Resume/CV: send me a message via email or LinkedIn: www.linkedin.com/in/amouravEmail: andrei@amourav.comPortfolio: https://github.com/amouravBackground: BSc (physics), MSc (medical biophysics \x96 thesis: MRI + ML)Some Interests: Computer Vision, NLP, Medical Image Analysis, Biomedical Signal Processing \x85I\x92ve recently completed my MSc in Medical Biophysics at the University of Toronto where I applied state of the art computer vision methods for automating aspects of radiology workflow. My graduate work focused on the development of deep learning methods for automated brain tumour segmentation and more classical machine learning techniques for prediction of patient response to radiotherapy from MRI data. In my most recent contract work, I worked with satellite data for agricultural analysis.TLDR: Recent graduate with some industry experience looking for a full/part-time data scientist / machine learning developer position.Thanks for reading. Shoot me an email if you want to chat :)',
       "Location: CaliforniaRemote: YesWilling to relocate: YesTechnologies: IT Ops/Strategy, PM, M&A, EDI, ERP, SAP, Azure/AWS, HCI, Virtualization, IaaS/PaaS, SD-WAN, InfoSec/Compliance (GDPR, ISO27001, SOX, PCI), Telephony, Asset Management, Enterprise WirelessRésumé/CV: https://www.linkedin.com/in/wasiahmed/Email: ITBusinessAdvocate@gmail.comI've been the Director of IT for Fortune 500 companies and love working with teams and scaling them/capabilities across different time zones and cultural contexts.",
       'Looking for software marketing position remote or in-personLocation: Boston, MA\nRemote: Yes\nLooking For: Full or part-time\nWilling to relocate: YesExperience/Skills: 4+ years in direct marketing and inbound lead gen, article/publication writing, ad and branding design, + investor relations/PRApps/Tools: social media advertising/campaigns, google ads, StatTracker, Amplitude, Appstore Connect Goog analytics, Buffer, MailChimp, Adobe suite, Canva, Figma, Medium, Salesforce, Zendesk, PitchbookRésume: On demandEmail: mbloom1915@gmail.com'],
      dtype=object)
In [4]:
topic_words, word_scores, topic_nums = model.get_topics()
In [5]:
topic_words
Out[5]:
array([['devops', 'stack', 'senior', ..., 'sr', 'django', 'recruiting'],
       ['students', 'universities', 'college', ..., 'lecture',
        'bachelors', 'grading'],
       ['ycombinator', 'item', 'id', ..., 'article', 'guidelines',
        'duplicate'],
       ...,
       ['federated', 'alternative', 'alternatives', ..., 'trustworthy',
        'hosted', 'proprietary'],
       ['eta', 'buy', 'buying', ..., 'drm', 'brand', 'ikea'],
       ['fullstack', 'berlin', 'cordova', ..., 'warsaw', 'onsite',
        'worklocation']], dtype='<U15')
In [7]:
model.topic_vectors
Out[7]:
array([[ 0.05727052,  0.00896881,  0.04296499, ..., -0.04414118,
        -0.02239947,  0.03427007],
       [ 0.10259358,  0.05204452,  0.15780123, ..., -0.06267212,
         0.06077799, -0.0361257 ],
       [ 0.1190334 ,  0.09148788,  0.0296447 , ..., -0.06155116,
         0.10000756, -0.18096188],
       ...,
       [ 0.11334322, -0.10704701, -0.03682499, ..., -0.06056097,
        -0.13479881,  0.00815639],
       [ 0.05718849,  0.06760921, -0.03789532, ..., -0.0319731 ,
        -0.00528994,  0.00432624],
       [ 0.13691679,  0.0904028 ,  0.1405135 , ..., -0.04356986,
        -0.01171116,  0.03768642]], dtype=float32)

Topic reduction (to 20 segments)

In [1]:
model.hierarchical_topic_reduction(20)
In [63]:
topic_words, word_scores, topic_nums = model.get_topics(reduced=True)
In [65]:
topic_nums
Out[65]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])
In [71]:
for topic in topic_nums:
    plt.figure()
    model.generate_topic_wordcloud(topic, reduced=True)
    plt.savefig(f'../img/cloud{topic}.png')
<ipython-input-71-3445bec89f5f>:2: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  plt.figure()
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>

Creating mappings between topics and users

In [66]:
topic_sizes = model.topic_sizes_reduced
In [67]:
topic_sizes
Out[67]:
0     24076
1     21682
2     21369
3     20823
4     20819
5     20621
6     19772
7     18729
8     18663
9     18279
10    17893
11    16585
12    15730
13    14629
14    14534
15    14437
16    14212
17    13867
18    13184
19    13181
dtype: int64
In [68]:
mapping_dict = {}

for topic in topic_nums:
    current_list = []
    documents, doc_scores, doc_ids = model.search_documents_by_topic(topic_num=topic, num_docs=topic_sizes[topic], reduced=True)
    for doc_id in doc_ids:
        current_list.append(df.iloc[doc_id, 0])
        
    mapping_dict[str(topic)] = current_list
In [70]:
with open('topic_user_mapping.json', 'w') as f:
    json.dump(mapping_dict, f)
In [75]:
model.get_num_topics(reduced=True)
Out[75]:
20
In [76]:
model.save('trained_model2_reduced')
In [90]:
date_counts = pd.to_datetime(df['time']).dt.floor('d').value_counts()
In [99]:
date_counts = date_counts.sort_index()
In [101]:
date_counts
Out[101]:
2019-12-07        1
2019-12-08        3
2019-12-09       19
2019-12-10     1153
2019-12-11     1350
              ...  
2020-09-19     7843
2020-09-20     7552
2020-09-21    11110
2020-09-22    11281
2020-09-23     3822
Name: time, Length: 236, dtype: int64
In [100]:
px.line(date_counts, y="time")
In [199]:
topic_vectors = model.topic_vectors_reduced
In [200]:
topic_vectors
Out[200]:
array([[-0.02582142, -0.04165698, -0.16124064, ...,  0.1008816 ,
        -0.07319082,  0.06825463],
       [-0.06826892, -0.01840919, -0.04894271, ...,  0.03845696,
         0.12143672,  0.0702476 ],
       [ 0.06288733, -0.03155832,  0.06969699, ...,  0.03567478,
         0.02518485,  0.05868496],
       ...,
       [ 0.24867156, -0.07170543, -0.15194356, ...,  0.02992789,
         0.03976055,  0.10660741],
       [-0.10852522, -0.04511455, -0.13966589, ..., -0.07761919,
         0.05752036,  0.04622773],
       [ 0.05148629,  0.00394255, -0.03625961, ..., -0.04633759,
         0.03854463,  0.08910043]], dtype=float32)
In [145]:
topic_df = pd.DataFrame(topic_vectors)

PCA of topic segments

In [166]:
topic_df.shape
Out[166]:
(20, 300)
In [154]:
pca = PCA(n_components=2)
In [170]:
pca.fit(topic_df.T)
Out[170]:
PCA(n_components=2)
In [161]:
topic_df.shape
Out[161]:
(20, 300)
In [113]:
pca.singular_values_
Out[113]:
array([2.116336 , 1.8549052], dtype=float32)
In [171]:
topics_reduced = pca.components_
In [163]:
topics_reduced.shape
Out[163]:
(2, 300)
In [172]:
topics_reduced
Out[172]:
array([[ 0.19426341,  0.23341061,  0.22749592,  0.25628307,  0.22416247,
         0.20440319,  0.26427862,  0.22713801,  0.20923783,  0.19611755,
         0.2211092 ,  0.21568361,  0.21782911,  0.243604  ,  0.2610298 ,
         0.24571158,  0.12846169,  0.15757117,  0.24806018,  0.24710566],
       [ 0.33383387,  0.04838067, -0.10739743, -0.13143572,  0.1625792 ,
         0.06090812, -0.30473307, -0.06543408,  0.3094204 ,  0.34937477,
         0.18105265,  0.02014749,  0.13866214,  0.04092873, -0.26307398,
        -0.3106055 ,  0.0438354 ,  0.4205305 , -0.2772885 , -0.19205974]],
      dtype=float32)
In [173]:
topics_out = topics_reduced.T
In [174]:
topics = pd.DataFrame(topics_out, columns=['x','y'])
In [179]:
topics['index1'] = topics.index
In [196]:
segments_dict = {
    0: 'programming',
    1: 'art',
    2: 'psychology',
    3: 'mechanics',
    4: 'computers',
    5: 'forum',
    6: 'US election',
    7: 'finanses',
    8: 'writing',
    9: 'operating systems',
    10: 'cybersecurity',
    11: 'publishing',
    12: 'web browsers',
    13: 'science',
    14: 'illneses',
    15: 'equal rights',
    16: 'work',
    17: 'technologies',
    18: 'economy',
    19: 'law'
}
In [ ]:
px.scatter(topics, 'x', 'y', text='index1', opacity=0)